library(tidyverse); library(cowplot)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.1     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(phyloseq); library(decontam)
library(plotly); library(lubridate)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:cowplot':
## 
##     stamp
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

1 Merged data from QIIME2 analysis

Run below to repeat ASV compilation from raw QIIME2 output.

# merged_tax <- read_delim("/Users/sarahhu/Desktop/Projects/microeuks_deepbiosphere_datamine/microeuk-amplicon-survey/data-input/taxonomy.tsv", delim = "\t")
# merged_asv <- read_delim("/Users/sarahhu/Desktop/Projects/microeuks_deepbiosphere_datamine/microeuk-amplicon-survey/data-input/microeuk-merged-asv-table.tsv", delim = "\t", skip = 1)
# metadata <- read.delim("/Users/sarahhu/Desktop/Projects/microeuks_deepbiosphere_datamine/microeuk-amplicon-survey/data-input/samplelist-metadata.txt")
# 
# asv_wtax <- merged_asv %>%
#   select(FeatureID = '#OTU ID', everything()) %>%
#   pivot_longer(cols = !FeatureID,
#                names_to = "SAMPLE", values_to = "value") %>%
#   left_join(merged_tax, by = c("FeatureID" = "Feature ID")) %>%
#   left_join(metadata) %>%
#   filter(SITE == "GordaRidge" | SITE == "substrate" | SITE == "Laboratory") %>%
#   filter(!grepl("Siders_", SAMPLE)) %>%
#   filter(!(grepl("T0", SAMPLE))) %>%
#   filter(!(grepl("T24", SAMPLE))) %>%
#   filter(!(grepl("T36", SAMPLE))) %>%
#   mutate(DATASET = case_when(
#     grepl("_GR_", SAMPLE) ~ "GR",
#     grepl("Gorda", SAMPLE) ~ "GR",
#     grepl("_MCR_", SAMPLE) ~ "MCR",
#     grepl("Axial", SAMPLE) ~ "Axial",
#   TRUE ~ "Control or blank")) %>%
#     separate(Taxon, c("Domain", "Supergroup",
#                   "Phylum", "Class", "Order",
#                   "Family", "Genus", "Species"), sep = ";", remove = FALSE)
# 
# # fix naming, some controls sequenced separately.
# gr_substrate_fluid_asvs <- asv_wtax %>%
#   mutate(SAMPLE_tmp = case_when(
#     Sample_actual == "" ~ SAMPLE,
#     TRUE ~ Sample_actual
#   )) %>%
#   select(-SAMPLE) %>%
#   select(SAMPLE = SAMPLE_tmp, everything()) %>%
#   filter(value > 0)

# View(gr_substrate_fluid_asvs)
# View(gr_substrate_fluid_asvs %>% filter(Sample_or_Control == "Control") %>% select(SAMPLE) %>% distinct())
# save(gr_substrate_fluid_asvs, file = "/Users/sarahhu/Desktop/Projects/GordaRidge-microcolonizers/microcolonizers-GordaRidge/input-data/asv-table.RData")

2 Import microcolonizer data

load("input-data/asv-table.RData", verbose = TRUE)
## Loading objects:
##   gr_substrate_fluid_asvs

2.0.1 Quick look - unprocessed data

Barplots to show total number of sequences and total number of ASVs

plot_grid(gr_substrate_fluid_asvs %>% 
  filter(value > 0) %>% 
  ggplot(aes(x = SAMPLE)) +
  geom_bar(stat = "count", width = 0.9) +
  labs(y = "Total ASVs") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")),
  gr_substrate_fluid_asvs %>% 
  group_by(SAMPLE, SITE, Domain, DATASET) %>% 
  summarise(SUM_SEQ_DOMAIN = sum(value)) %>% 
  ggplot(aes(x = SAMPLE, y = SUM_SEQ_DOMAIN, fill = Domain)) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  labs(y = "Total Sequences") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'SITE', 'Domain'. You can
## override using the `.groups` argument.

2.0.2 Table reporting total ASVs and sequences

# View(gr_substrate_fluid_asvs %>% filter(value > 0) %>% 
#        group_by(SAMPLE, DATASET, SITE) %>% 
#        summarise(SEQ_SUM = sum(value),
#                  ASV_COUNT = n()))

2.1 Decontaminate sequence library

Import sample description text file, import as phyloseq library, and remove potential contaminate ASVs and sequences. Catalog total number of ASVs and sequences removed from analysis.

2.1.1 Import as phyloseq objects

# head(gr_substrate_fluid_asvs)
tax_matrix <- gr_substrate_fluid_asvs %>% 
  select(FeatureID, Taxon) %>% 
  distinct() %>% 
  separate(Taxon, c("Domain", "Supergroup", 
                  "Phylum", "Class", "Order",
                  "Family", "Genus", "Species"), sep = ";", remove = FALSE) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix
## Warning: Expected 8 pieces. Additional pieces discarded in 3115 rows [3, 4, 6,
## 9, 11, 12, 13, 16, 21, 25, 28, 30, 33, 38, 42, 44, 47, 49, 50, 52, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 2988 rows [1, 2,
## 5, 7, 8, 10, 14, 15, 17, 18, 19, 20, 22, 23, 24, 26, 27, 29, 31, 32, ...].
asv_matrix <- gr_substrate_fluid_asvs %>% 
  select(FeatureID, SAMPLE, value) %>% 
  pivot_wider(names_from = "SAMPLE", values_fill = 0, values_from = value) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix

# Align row names for each matrix
rownames(tax_matrix) <- row.names(asv_matrix)

metadata_cones <- gr_substrate_fluid_asvs %>% 
  select(SAMPLE, Type, VENT, SITE, SAMPLETYPE, Sample_or_Control) %>% 
  distinct() %>% 
  column_to_rownames(var = "SAMPLE")
# Import asv and tax matrices
ASV = otu_table(asv_matrix, taxa_are_rows = TRUE)
TAX = tax_table(tax_matrix)
phylo_obj <- phyloseq(ASV, TAX)

# Import metadata as sample data in phyloseq
samplenames <- sample_data(metadata_cones)

# join as phyloseq object
physeq_wnames = merge_phyloseq(phylo_obj, samplenames)
# colnames(ASV)

## Check
# physeq_wnames
# View(gr_substrate_fluid_asvs %>% filter(value > 0))

2.1.2 Identify contaminant ASVs

In addition to shipboard milliQ blank samples, each substrate type had a ‘blank’ control, which was sampled at the same time, but never deployed in the microcolonizers (only processed at the same time in the lab).

# When "Control" appears in "Sample_or_Control column, this is a negative control"
sample_data(physeq_wnames)$is.neg <- sample_data(physeq_wnames)$Sample_or_Control == "Control"
# ID contaminants using Prevalence information
contam_prev <- isContaminant(physeq_wnames, 
                               method="prevalence", 
                               neg="is.neg", 
                               threshold = 0.5, normalize = TRUE) 

# Report number of ASVs IDed as contamintants
table(contam_prev$contaminant)
## 
## FALSE  TRUE 
##  6060    43

0.5 - this threshold will ID contaminants in all samples that are more prevalent in negative controls than in positive samples.

2.1.3 Remove problematic ASVs

# Subset contaminant ASVs
contams <- filter(contam_prev, contaminant == "TRUE")
list_of_contam_asvs <- as.character(row.names(contams))
# length(list_of_contam_asvs)

taxa_contam <- as.data.frame(tax_matrix) %>% 
  rownames_to_column(var = "FeatureID") %>% 
  filter(FeatureID %in% list_of_contam_asvs)
# head(taxa_contam)
# View(asv_wtax)
asv_wtax_decon <- gr_substrate_fluid_asvs %>% 
  filter(!(FeatureID %in% list_of_contam_asvs)) %>% 
  filter(!(Sample_or_Control == "Control"))

tmp_orig <- (gr_substrate_fluid_asvs %>% filter(!(Sample_or_Control == "Control")))

# Stats on lost
x <- length(unique(tmp_orig$FeatureID)); x
## [1] 5912
y <- length(unique(asv_wtax_decon$FeatureID)); y
## [1] 5886
100*((y-x)/x) #0.43% of ASVs lost
## [1] -0.4397835
a <- sum(tmp_orig$value);a #3.1 million
## [1] 3155152
b <- sum(asv_wtax_decon$value);b #2.89 million 
## [1] 2981359
100*((b-a)/a)
## [1] -5.508229
# Lost 5.5% of sequences from whole dataset.

## Subsample to clean ASVs
asv_wtax_wstats <- gr_substrate_fluid_asvs %>% 
  mutate(DECONTAM = case_when(
    FeatureID %in% list_of_contam_asvs ~ "FAIL",
    TRUE ~ "PASS"
  ))
plot_grid(asv_wtax_wstats %>% 
  filter(value > 0) %>% 
  ggplot(aes(x = SAMPLE, fill = DECONTAM)) +
  geom_bar(stat = "count", width = 0.9, color = "black") +
  labs(y = "Total ASVs") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  asv_wtax_wstats %>% 
  group_by(SAMPLE, SITE, DECONTAM, DATASET) %>% 
  summarise(SUM_SEQ_DOMAIN = sum(value)) %>% 
  ggplot(aes(x = SAMPLE, y = SUM_SEQ_DOMAIN, fill = DECONTAM)) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  labs(y = "Total Sequences") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'SITE', 'DECONTAM'. You can
## override using the `.groups` argument.

3 Subset Gorda Ridge Cones and in situ

Isolate Gorda Ridge substrate samples and the in situ samples from Mt Edwards, plume at Mt Edwards and the background seawater. Remove in situ

# colnames(asv_wtax_wstats)
# unique(asv_wtax_wstats$Sample_actual)
# Isolate blank substrate for GR cones

asv_GR <- asv_wtax_wstats %>% 
      filter(Sample_or_Control != "Control") %>% # rm controls
      filter(SITE == "substrate" | VENT == "Mt Edwards" | 
                VENT == "Mt Edwards Plume" | VENT == "Deep seawater" | VENT == "Shallow seawater") %>% 
      filter(DATASET != "Axial", DATASET != "MCR") %>% 
      filter(!grepl("_expTf_", SAMPLE)) %>% 
      filter(value > 0) %>% 
      filter(DECONTAM == "PASS") #%>% 
      # bind_rows(asv_substrate_blanks)

# Get quick stats on totals
sum(asv_GR$value) # 2.3 million sequences
## [1] 2366302
length(unique(asv_GR$FeatureID)) #3370 ASVs
## [1] 3370
# View(asv_GR)
# unique(asv_GR$VENT)

Additional sample QC, check replicates, and determine if replicates should be averaged.

plot_grid(asv_GR %>% 
  group_by(SAMPLE, VENT, DATASET, SAMPLETYPE, Domain) %>% 
  summarise(seqsum_var = sum(value),
            asvcount_var = n()) %>% 
  pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>% 
  ggplot(aes(x = SAMPLE, y = value, fill = Domain)) +
    geom_bar(color = "black", stat = "identity", position = "fill") +
    facet_grid(VARIABLE ~ SAMPLETYPE, space = "free", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme_linedraw() +
  scale_fill_brewer(palette = "Paired") +
  theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom"),
  asv_GR %>% 
  group_by(SAMPLE, VENT, DATASET, SAMPLETYPE, Domain) %>% 
  summarise(seqsum_var = sum(value),
            asvcount_var = n()) %>% 
  pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>% 
  ggplot(aes(x = SAMPLE, y = value, fill = Domain)) +
    geom_bar(color = "black", stat = "identity", position = "stack") +
    facet_grid(VARIABLE ~ SAMPLETYPE, space = "free_x", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme_linedraw() +
  scale_fill_brewer(palette = "Paired") +
  theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom"),
ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'VENT', 'DATASET', 'SAMPLETYPE'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLE', 'VENT', 'DATASET', 'SAMPLETYPE'.
## You can override using the `.groups` argument.

Based on above plot, remove “BSW020” sterivex filter.

3.1 Taxonomy bar plots

Supergroup distribution by background fluid vs. what was isolated from substrates.

Factor supergroup names

tmp <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup))
colors_tax <- c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
    "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
    "#807dba", "#54278f", "#bdbdbd", "black")

tax_names <- as.character(unique(tmp$Supergroup))
names(colors_tax) <- tax_names
# View(asv_GR)
by_supergroup_back <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET, Type) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Supergroup, Type) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  filter(SITE == "GordaRidge") %>% 
  ggplot(aes(x = Sample_substrate, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.6) +
    facet_grid(. ~ SITE + SAMPLETYPE, space = "free_x", scales = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = colors_tax) +
  labs(title = "Microcolonizers and background", x = "", y = "Relative sequence abundance")
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain',
## 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT',
## 'SITE', 'SAMPLETYPE', 'YEAR', 'DATASET'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'Supergroup'.
## You can override using the `.groups` argument.
by_supergroup_substrate <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET, Type) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Supergroup, Type) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  filter(SITE != "GordaRidge") %>% 
  ggplot(aes(x = Sample_substrate, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.6) +
    facet_grid(. ~ SITE + SAMPLETYPE, space = "free_x", scales = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = colors_tax) +
  labs(title = "Microcolonizers and background", x = "", y = "Relative sequence abundance")
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain',
## 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT',
## 'SITE', 'SAMPLETYPE', 'YEAR', 'DATASET'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'Supergroup'.
## You can override using the `.groups` argument.

3.1.1 Taxonomy plot of background fluid

plotly::ggplotly(by_supergroup_back)

3.1.2 Taxonomy plot of microcolonizers

plotly::ggplotly(by_supergroup_substrate)

3.1.3 Plot only metazoa

metazoa_substrate <- asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET, Type) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  # Opisthokonts only
  filter(Supergroup == "Opisthokonta") %>%
  unite(Phylum_Class, Phylum, Class, sep = "-") %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Type, Supergroup, Phylum_Class) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  ggplot(aes(x = Sample_substrate, y = SEQ_SUM, fill = Phylum_Class)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
    facet_grid(. ~ SITE + SAMPLETYPE, scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  # scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
  #   "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
  #   "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
  #   "#807dba", "#54278f", "#bdbdbd", "black")) +
  labs(title = "Average across replicates", x = "", y = "Relative sequence abundance")
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain',
## 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT',
## 'SITE', 'SAMPLETYPE', 'YEAR', 'DATASET'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'Type',
## 'Supergroup'. You can override using the `.groups` argument.
plotly::ggplotly(metazoa_substrate)

4 What ASVs are shared across samples?

library(ggupset)
asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(Sample_or_Control != "Control") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Supergroup, 
           VENT, Type) %>% 
    summarise(SUM = sum(value)) %>% 
  ungroup() %>%
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  distinct(FeatureID, Supergroup, SUM, Sample_substrate, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(Sample_substrate = list(Sample_substrate)) %>% 
  ggplot(aes(x = Sample_substrate)) +
    geom_bar(color = "black", width = 0.8, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 30)
## `summarise()` has grouped output by 'FeatureID', 'Supergroup', 'VENT'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'FeatureID'. You can override using the
## `.groups` argument.
## Warning: Removed 587 rows containing non-finite values (stat_count).

# unique(asv_GR$VENT)
asv_GR %>% 
  filter(Domain == "Eukaryota") %>%
  filter(!(Type == "")) %>% 
  filter(Sample_or_Control != "Control") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Supergroup, 
           VENT, Type) %>% 
    summarise(SUM = sum(value)) %>% 
  ungroup() %>%
  unite(Sample_substrate, VENT, Type, sep = " ") %>% 
  distinct(FeatureID, Supergroup, SUM, Sample_substrate, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(Sample_substrate = list(Sample_substrate)) %>% 
  ggplot(aes(x = Sample_substrate)) +
    geom_bar(color = "black", width = 0.8, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 30)
## `summarise()` has grouped output by 'FeatureID', 'Supergroup', 'VENT'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'FeatureID'. You can override using the
## `.groups` argument.
## Warning: Removed 282 rows containing non-finite values (stat_count).

5 Import temperature data and other information for each microcolonizer

ibuttons deployed with each microcolonizer.

sample_list <- read.delim("input-data/MC-samplelist.txt")
sample_metadata <- read.delim("input-data/MC-collect-info.txt")
amy <- c("Quartz1", "Basalt", "Olivine", "Pyrite")
sample_list_complete <- sample_list %>% 
  left_join(sample_metadata) %>% 
  separate(Sample_type, c("Sample_material", "Sample_type"), sep = "-") %>% 
  mutate(sample_purpose = case_when(
    (Sample_material %in% amy & Sample_type == "DNA") ~ "16S",
    (!(Sample_material %in% amy) & Sample_type == "DNA") ~ "18S",
    (Sample_material %in% amy & Sample_type == "SEM") ~ "Prok SEM",
    (!(Sample_material %in% amy) & Sample_type == "SEM") ~ "Euk SEM"
  ))
## Joining, by = "MC_ID"
# write_delim(sample_list_complete, path = "sample-list-MC.txt", delim = "\t")

5.1 Process ibutton data

list_mc <- read.delim("input-data/microcolonizer_list.txt")
head(list_mc)
##   ibuttons.num Deploy.date Deploy.time..UTC. Recover.date Nautilus.Sample.ID
## 1            1      29-May              8:30        5-Jun          NA108-076
## 2            2      29-May              8:30        4-Jun          NA108-045
## 3            3      29-May              8:30        4-Jun          NA108-046
## 4            4      29-May              8:30        5-Jun          NA108-077
## 5            5      29-May              8:30        6-Jun          NA108-097
## 6            6      29-May              8:30        6-Jun          NA108-098
##    Dive Vent.site.name Recover.time..UTC..start Recover.time..UTC..end     Lat
## 1 H1754     Mt Edwards                     1819                   1822 42.7547
## 2 H1753     Mt Edwards                     2004                   2011 42.7547
## 3 H1753     Mt Edwards                     2011                   2015 42.7547
## 4 H1754     Mt Edwards                     1823                   1824 42.7548
## 5 H1755     Mt Edwards                     2228                   2231 42.7547
## 6 H1755     Mt Edwards                     2231                   2231 42.7547
##       Long Depth..m. Temp..C..Herc Salinigyt.PSU Oxygen..μmoles.L.
## 1 -126.709  2706.768       1.79985       34.6189          64.63451
## 2 -126.709  2707.849       1.95566       34.6527          64.93543
## 3 -126.709  2707.938       4.84292       34.4767          66.27680
## 4 -126.709  2706.780       5.55486       34.6322          68.08677
## 5 -126.709  2707.796       3.29768       34.5240          65.41028
## 6 -126.709  2707.852       2.84660       34.5571          67.84714
##   Corrected.O2data..x0.813.
## 1                   52.5479
## 2                   52.7925
## 3                   53.8830
## 4                   55.3545
## 5                   53.1786
## 6                   55.1597

Deployment of all Microcolonizers was at 2019-05-29, 20:59:57.659 (UTC) (also 8:59 PM UTC), which is 4:59PM EST or 16:59 PM EST

Function to import all ibutton data raw and process.

# Recovered microcolonizers by ibutton IDs:
## 1, 2, 3, 4, 5, 6
recovered <- c("2019-06-05 14:19:00","2019-06-04 16:04:00","2019-06-04 16:11:00","2019-06-05 14:23:00","2019-06-06 18:28:00","2019-06-06 18:31:00")
# Sys.timezone()
# tmp_2 <- logger2 %>% add_column(MC = 2)
# 
# logger1 <- read.csv("input-data/Logger1Data.csv", skip = 19)
# logger2 <- read.csv("input-data/Logger2Data.csv", skip = 19)
# x <- 1
mc_ids <- c(1, 2, 3, 4, 5, 6)

for(num in mc_ids){
  log_file <- read.csv(paste("input-data/Logger", num, "Data.csv", sep = ""), skip = 19)  
  cat("Reading in log file number", num, "\n")
  log_out <- log_file %>% 
    add_column(MC = num) %>% 
    mutate(Parsed_time_EST = parse_date_time(Date.Time, "%m/%d/%y %H:%M:%S p", tz = "America/New_York")) %>%
    # Filter out irrelevant date before deployment
    filter(Parsed_time_EST > "2019-05-29 18:59:00") %>% 
    filter(
      Parsed_time_EST < recovered[[num]]
    )
  if(!exists("log_files_all")){
    log_files_all <- log_out
  } else{
    log_files_all <- bind_rows(log_files_all, log_out)
  }
}
## Reading in log file number 1 
## Reading in log file number 2 
## Reading in log file number 3 
## Reading in log file number 4 
## Reading in log file number 5 
## Reading in log file number 6
# rm(log_out); rm(log_files_all)
# head(log_out)
# View(log_files_all)


# Factor by colors and pairs of MCs
log_files_all$MC_ORDER <- factor(log_files_all$MC, levels = mc_ids)
mc_col <- c("#d7301f", "#4a1486", "#9e9ac8", "#fc8d59", "#2171b5", "#6baed6")
names(mc_col) <- mc_ids

Graph microcolonizer temperatures.

ggplot(log_files_all, aes(x = Parsed_time_EST, y = Value, group = as.factor(MC_ORDER), color = as.factor(MC_ORDER))) +
  geom_path() +
  scale_color_manual(values = mc_col) +
  theme_classic(base_size = 14) +
  labs(x = "", y = "Temperature •C") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        legend.title = element_blank())

5.2 Temperature data

ggplot(log_files_all, aes(x = Parsed_time_EST, y = Value, color = as.factor(MC_ORDER))) +
  geom_step() +
  scale_color_manual(values = mc_col) +
  theme_classic(base_size = 14) +
  labs(x = "", y = "Temperature •C") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        legend.title = element_blank())

# ?geom_smooth

6 Session end

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /Users/sarahhu/anaconda3/envs/r_4.1/lib/libopenblasp-r0.3.15.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggupset_0.3.0   lubridate_1.8.0 plotly_4.10.0   decontam_1.12.0
##  [5] phyloseq_1.36.0 cowplot_1.1.1   forcats_0.5.1   stringr_1.4.0  
##  [9] dplyr_1.0.9     purrr_0.3.4     readr_2.1.1     tidyr_1.2.0    
## [13] tibble_3.1.7    ggplot2_3.3.6   tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-155           bitops_1.0-7           fs_1.5.2              
##  [4] RColorBrewer_1.1-3     httr_1.4.3             GenomeInfoDb_1.28.4   
##  [7] tools_4.1.0            backports_1.4.1        bslib_0.3.1           
## [10] vegan_2.5-7            utf8_1.2.2             R6_2.5.1              
## [13] lazyeval_0.2.2         mgcv_1.8-38            DBI_1.1.2             
## [16] BiocGenerics_0.38.0    colorspace_2.0-3       permute_0.9-5         
## [19] rhdf5filters_1.4.0     ade4_1.7-18            withr_2.5.0           
## [22] tidyselect_1.1.2       compiler_4.1.0         cli_3.3.0             
## [25] rvest_1.0.2            Biobase_2.52.0         xml2_1.3.3            
## [28] labeling_0.4.2         sass_0.4.0             scales_1.2.0          
## [31] digest_0.6.29          rmarkdown_2.11         XVector_0.32.0        
## [34] pkgconfig_2.0.3        htmltools_0.5.2        highr_0.9             
## [37] dbplyr_2.1.1           fastmap_1.1.0          htmlwidgets_1.5.4     
## [40] rlang_1.0.2            readxl_1.3.1           rstudioapi_0.13       
## [43] farver_2.1.0           jquerylib_0.1.4        generics_0.1.2        
## [46] jsonlite_1.8.0         crosstalk_1.2.0        RCurl_1.98-1.5        
## [49] magrittr_2.0.3         GenomeInfoDbData_1.2.6 biomformat_1.20.0     
## [52] Matrix_1.4-0           Rcpp_1.0.8             munsell_0.5.0         
## [55] S4Vectors_0.30.2       Rhdf5lib_1.14.2        fansi_1.0.3           
## [58] ape_5.6-2              lifecycle_1.0.1        stringi_1.7.6         
## [61] yaml_2.2.2             MASS_7.3-57            zlibbioc_1.38.0       
## [64] rhdf5_2.36.0           plyr_1.8.7             grid_4.1.0            
## [67] parallel_4.1.0         crayon_1.5.1           lattice_0.20-45       
## [70] splines_4.1.0          Biostrings_2.60.2      haven_2.4.3           
## [73] multtest_2.48.0        hms_1.1.1              knitr_1.37            
## [76] pillar_1.7.0           igraph_1.3.1           reshape2_1.4.4        
## [79] codetools_0.2-18       stats4_4.1.0           reprex_2.0.1          
## [82] glue_1.6.2             evaluate_0.14          data.table_1.14.2     
## [85] modelr_0.1.8           vctrs_0.4.1            tzdb_0.2.0            
## [88] foreach_1.5.1          cellranger_1.1.0       gtable_0.3.0          
## [91] assertthat_0.2.1       xfun_0.29              broom_0.7.11          
## [94] viridisLite_0.4.0      survival_3.2-13        iterators_1.0.13      
## [97] IRanges_2.26.0         cluster_2.1.2          ellipsis_0.3.2